library(tidyverse)
library(stringr)
library(caret)
library(plotly)
library(ggthemes)
library(GGally)
library(class)
library(e1071)
library(stringr)

1 Introduction

This analysis is about the number of beers consumed by American people in the United States. In this document we analyze beers and breweries by state, focusing on alcohol by volume (ABV) and International Bitterness Units (IBU).

1.0.1 Reading data files

1.0.2 Addressing the missing values in each column

# Mean imputation for ABV: copy the raw beers table and overwrite every
# missing ABV with the mean of the observed ABV values.
df_beers_cl0 <- df_beers
nr_mean_abv <- mean(df_beers_cl0$ABV, na.rm = TRUE)
length_abv <- sum(is.na(df_beers_cl0$ABV))
if (length_abv > 0) {
  df_beers_cl0$ABV[is.na(df_beers_cl0$ABV)] <- nr_mean_abv
}

# Mean imputation for IBU: start again from the raw beers table and overwrite
# every missing IBU with the mean of the observed IBU values.
df_beers_cl1 <- df_beers
nr_mean_ibu <- mean(df_beers_cl1$IBU, na.rm = TRUE)
length_ibu <- sum(is.na(df_beers_cl1$IBU))
if (length_ibu > 0) {
  df_beers_cl1$IBU[is.na(df_beers_cl1$IBU)] <- nr_mean_ibu
}


# Cleaning using knnImpute
# preProcValues <- preProcess(df_beers %>% 
#                           select(ABV,IBU),
#                             method = c("knnImpute"),
#                             k = 20,
#                             knnSummary = mean)
# df_beers_unp <- predict(preProcValues, df_beers,na.action = na.pass)
# procNames <- data.frame(col = names(preProcValues$mean), mean = preProcValues$mean, sd = preProcValues$std)
# for(i in procNames$col){
#  df_beer_info[i] <- df_beer_info[i]*preProcValues$std[i]+preProcValues$mean[i] 
# }
# Fit a caret pre-processing model that imputes ABV and IBU from the
# 20 nearest neighbours, summarising the neighbours with their mean.
knn_imp_model <- preProcess(
  select(df_beers_cl0, ABV, IBU),
  method = "knnImpute",
  k = 20,
  knnSummary = mean
)

# Apply the model to the ABV-mean-imputed table.
df_beers_unp <- predict(knn_imp_model, df_beers_cl0, na.action = na.pass)

# Per-column mean and standard deviation stored by the model.
procNames <- data.frame(
  col = names(knn_imp_model$mean),
  mean = knn_imp_model$mean,
  sd = knn_imp_model$std
)

# The predicted columns come back standardised; multiply by the stored
# standard deviation and add the stored mean to return to original units.
for (col_name in procNames$col) {
  df_beers_unp[col_name] <-
    df_beers_unp[col_name] * knn_imp_model$std[col_name] +
    knn_imp_model$mean[col_name]
}


# Number of records in the raw beers table (2410 in the analysed data).
nr_rows <- nrow(df_beers)
#Finding the NAN values
#summary(df_beers)
# df_beers  %>% ggplot(aes(x=IBU))+geom_histogram(aes(fill="green")) 
# df_beers  %>% ggplot(aes(x=ABV))+geom_histogram(aes(fill="green")) 
# 
# df_beers_cl1  %>% ggplot(aes(x=IBU))+geom_histogram(aes(fill="blue"))
# df_beers_cl1  %>% ggplot(aes(x=ABV))+geom_histogram(aes(fill="blue"))

# Histogram of IBU after imputation.
ggplot(df_beers_unp, aes(x = IBU)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of ABV after imputation.
ggplot(df_beers_unp, aes(x = ABV)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Box plot of ABV after imputation, then summary statistics of every column.
ggplot(df_beers_unp, aes(y = ABV)) +
  geom_boxplot()

summary(df_beers_unp)
##      Name              Beer_ID            ABV               IBU        
##  Length:2410        Min.   :   1.0   Min.   :0.00100   Min.   :  4.00  
##  Class :character   1st Qu.: 808.2   1st Qu.:0.05000   1st Qu.: 24.20  
##  Mode  :character   Median :1453.5   Median :0.05700   Median : 36.70  
##                     Mean   :1431.1   Mean   :0.05977   Mean   : 42.92  
##                     3rd Qu.:2075.8   3rd Qu.:0.06700   3rd Qu.: 60.00  
##                     Max.   :2692.0   Max.   :0.12800   Max.   :138.00  
##    Brewery_id       Style               Ounces     
##  Min.   :  1.0   Length:2410        Min.   : 8.40  
##  1st Qu.: 94.0   Class :character   1st Qu.:12.00  
##  Median :206.0   Mode  :character   Median :12.00  
##  Mean   :232.7                      Mean   :13.59  
##  3rd Qu.:367.0                      3rd Qu.:16.00  
##  Max.   :558.0                      Max.   :32.00

1.0.3 How many breweries are present in each state

# One row per state (code and full name) with the number of breweries in it.
df_summary <- df_breweries_2 %>%
  group_by(State, Name_State) %>%
  summarize(NumberBreweries = n())
## `summarise()` has grouped output by 'State'. You can override using the
## `.groups` argument.
# Render the per-state counts as a table. df_summary holds NumberBreweries
# (rows of the breweries table per state), so the caption must say breweries,
# not beers.
knitr::kable(
  df_summary,
  caption = "Number of Breweries by State"
)
Number of Breweries by State
State Name_State NumberBreweries
AK Alaska 7
AL Alabama 3
AR Arkansas 2
AZ Arizona 11
CA California 39
CO Colorado 47
CT Connecticut 8
DC District of Columbia 1
DE Delaware 2
FL Florida 15
GA Georgia 7
HI Hawaii 4
IA Iowa 5
ID Idaho 5
IL Illinois 18
IN Indiana 22
KS Kansas 3
KY Kentucky 4
LA Louisiana 5
MA Massachusetts 23
MD Maryland 7
ME Maine 9
MI Michigan 32
MN Minnesota 12
MO Missouri 9
MS Mississippi 2
MT Montana 9
NC North Carolina 19
ND North Dakota 1
NE Nebraska 5
NH New Hampshire 3
NJ New Jersey 3
NM New Mexico 4
NV Nevada 2
NY New York 16
OH Ohio 15
OK Oklahoma 6
OR Oregon 29
PA Pennsylvania 25
RI Rhode Island 5
SC South Carolina 4
SD South Dakota 1
TN Tennessee 3
TX Texas 28
UT Utah 4
VA Virginia 16
VT Vermont 10
WA Washington 23
WI Wisconsin 20
WV West Virginia 1
WY Wyoming 4

1.0.4 Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file

# Join every beer to its brewery: beers carry the key as Brewery_id,
# breweries carry it as Brew_ID.
df_beerbre_unp <- merge(
  x = df_beers_unp,
  y = df_breweries_2,
  by.x = "Brewery_id",
  by.y = "Brew_ID"
)

1.0.5 Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare

# Median ABV and median IBU per state. The columns are named Median_* and the
# task asks for medians, so the statistic must be median(), not mean().
df_acom_bebrew_1 <- df_beerbre_unp %>%
  group_by(State, Name_State) %>%
  summarize(Median_ABV = median(ABV), Median_IBU = median(IBU))

# df_acom_bebrew_1 %>% ggplot(aes(x=State,color=State))+geom_bar()+labs(title = "Alcohol by Volume",subtitle = "Alcohol by Volume average by State")+coord_flip()

1.0.6 Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer

# Take the single beer with the highest ABV and report the state it comes from.
df_sort_1 <- df_beerbre_unp %>%
  arrange(desc(ABV)) %>%
  head(n = 1)
sprintf(
  "The state that has the maximum ABV is %s-%f",
  df_sort_1$Name_State,
  df_sort_1$ABV
)
## [1] "The state that has the maximum ABV is Colorado-0.128000"

1.0.7 Comment on the summary statistics and distribution of the ABV variable

1.0.7.1 According to the histogram, the ABV data appear to be approximately normally distributed

# Histogram of ABV over the merged data, with bars filled by state.
ggplot(df_beerbre_unp, aes(x = ABV, fill = State)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

### Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot. Make your best judgment of a relationship and EXPLAIN your answer

# Pairwise plot of bitterness against alcohol content. columnLabels are applied
# in column order, so they must follow the select() order (IBU first, then
# ABV); the previous c("ABV", "IBU") swapped the panel labels.
df_beerbre_unp %>%
  select(IBU, ABV) %>%
  ggpairs(columnLabels = c("IBU", "ABV"))

### Activity 8: Difference between IPAs and other Ales with respect to IBU and ABV

# Keep only beers whose Style names an IPA or an Ale. The patterns use word
# boundaries (\\b) so that "ale" is matched as a whole word: a plain substring
# match also hits "Pale" in styles such as "... Pale Lager" and would label
# those lagers as ales.
df_beerbre_fil0 <- filter(
  df_beerbre_unp,
  str_detect(Style, regex("\\bIPA\\b|\\bALE\\b", ignore_case = TRUE))
)

# Label each remaining beer: anything naming IPA is "IPA", the rest is "ALE".
df_beerbre_fil1 <- mutate(
  df_beerbre_fil0,
  Type = ifelse(
    str_detect(Style, regex("\\bIPA\\b", ignore_case = TRUE)),
    "IPA",
    "ALE"
  )
)


# Scatter plot of IBU against ABV, coloured by beer type (IPA vs. ALE).
# The title and subtitle previously misspelled the acronym as "AVB".
df_beerbre_fil1 %>%
  ggplot(aes(x = ABV, y = IBU, color = Type)) +
  geom_point() +
  labs(
    title = "Relationship IBU and ABV",
    subtitle = "Relationship IBU/ABV by Beer Type"
  ) +
  xlab("Alcohol by Volume (ABV)") +
  ylab("International Bitterness Unit(IBU)")

# df_beerbre_fil1 = filter(df_beerbre_unp,str_detect(df_beerbre_unp$Style,regex("IPA",ignore_case = TRUE)))
# 
# df_beerbre_fil2 = filter(df_beerbre_unp,str_detect(df_beerbre_unp$Style,regex("ALE",ignore_case = TRUE)))

# k-NN classification of IPA vs. ALE from ABV and IBU.
#
# Fixed seed so the random train/test split, and therefore the reported
# confusion matrix, is reproducible from run to run.
set.seed(42)

nr_percentage <- 0.7 # share of rows used for training
nr_observations <- nrow(df_beerbre_fil1)
nr_k <- 5 # number of neighbours

lst_index_beer <- sample(nr_observations, round(nr_observations * nr_percentage))
df_train <- df_beerbre_fil1[lst_index_beer, ]
df_test <- df_beerbre_fil1[-lst_index_beer, ]

# Select the predictors by name instead of by column position (4, 5), which
# silently breaks if the column order of the merged table ever changes.
knn_features <- c("ABV", "IBU")

# Standardise the predictors with the training set's mean and sd. ABV is a
# small fraction while IBU runs into the tens and hundreds, so on raw values
# the Euclidean distance used by knn() is driven almost entirely by IBU.
train_scaled <- scale(df_train[, knn_features])
test_scaled <- scale(
  df_test[, knn_features],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)

knn_result <- knn(train_scaled, test_scaled, df_train$Type, prob = TRUE, k = nr_k)

# Rows are predictions, columns are the true types.
co_table <- table(knn_result, df_test$Type)
confusionMatrix(co_table)
## Confusion Matrix and Statistics
## 
##           
## knn_result ALE IPA
##        ALE 268  35
##        IPA  41 129
##                                           
##                Accuracy : 0.8393          
##                  95% CI : (0.8031, 0.8713)
##     No Information Rate : 0.6533          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6483          
##                                           
##  Mcnemar's Test P-Value : 0.5663          
##                                           
##             Sensitivity : 0.8673          
##             Specificity : 0.7866          
##          Pos Pred Value : 0.8845          
##          Neg Pred Value : 0.7588          
##              Prevalence : 0.6533          
##          Detection Rate : 0.5666          
##    Detection Prevalence : 0.6406          
##       Balanced Accuracy : 0.8269          
##                                           
##        'Positive' Class : ALE             
## 
#When it comes down to it, a Pale Ale should have a nice hop character but medium build, whereas IPAs tend to have higher ABV and IBU.
# Number of beers per state and beer style.
df_sum_1 <- df_beerbre_unp %>%
  group_by(State, Name_State, Style) %>%
  summarize(count_style = n())
## `summarise()` has grouped output by 'State', 'Name_State'. You can override
## using the `.groups` argument.
# Tag every style twice: flavor_type is "Ipa" when the style mentions IPA and
# "Pale" otherwise; general_stype is "Lager" when the style mentions lager and
# "Ale" otherwise. Both matches ignore case.
df_sum_1 <- df_sum_1 %>%
  mutate(
    flavor_type = ifelse(
      str_detect(Style, regex("IPA", ignore_case = TRUE)),
      "Ipa",
      "Pale"
    ),
    general_stype = ifelse(
      str_detect(Style, regex("lager", ignore_case = TRUE)),
      "Lager",
      "Ale"
    )
  )

# Stacked bars of beer counts per state, split by flavor_type; the state names
# on the x axis are rotated to vertical so they stay readable.
ggplot(df_sum_1, aes(x = Name_State, y = count_style, fill = flavor_type)) +
  geom_col(position = "stack") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Number of beers per state and beer style (input for the interactive chart).
df_sum_2 <- df_beerbre_unp %>%
  group_by(State, Name_State, Style) %>%
  summarize(count_style = n())
## `summarise()` has grouped output by 'State', 'Name_State'. You can override
## using the `.groups` argument.
# Plot-size options read by the repr package.
options(repr.plot.width = 1000, repr.plot.height = 2)

# Stacked bars of beer counts per style, filled by state, with vertical x
# labels, the legend on the left and a small base text size.
gfr <- ggplot(df_sum_2, aes(x = Style, y = count_style, fill = State)) +
  geom_bar(stat = "identity", position = "stack") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.position = "left",
    text = element_text(size = 8)
  )

# Convert the static ggplot into an interactive plotly widget.
ggplotly(gfr)